# Always print this out before your assignment
sessionInfo()
R version 4.1.2 (2021-11-01)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Catalina 10.15.7
Matrix products: default
BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets
[6] methods base
other attached packages:
[1] here_1.0.1 knitr_1.36
loaded via a namespace (and not attached):
[1] compiler_4.1.2 fastmap_1.1.0 rprojroot_2.0.2
[4] tools_4.1.2 htmltools_0.5.2 yaml_2.2.1
[7] rmarkdown_2.11 xfun_0.28 digest_0.6.28
[10] rlang_0.4.12 evaluate_0.14
getwd()
[1] "/Users/ryanradcliff/Documents/BUS696/BROCODE_Final_Project"
# load all your libraries in this chunk
library('tidyverse')
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
── Attaching packages ───────────────────────────── tidyverse 1.3.1 ──
✓ ggplot2 3.3.5 ✓ purrr 0.3.4
✓ tibble 3.1.6 ✓ dplyr 1.0.7
✓ tidyr 1.1.4 ✓ stringr 1.4.0
✓ readr 2.1.0 ✓ forcats 0.5.1
── Conflicts ──────────────────────────────── tidyverse_conflicts() ──
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
library("fs")
library('here')
here() starts at /Users/ryanradcliff/Documents/BUS696
library('dplyr')
library('tidyverse')
library('ggplot2')
library('ggrepel')
library('ggthemes')
library('forcats')
library('rsample')
library('lubridate')
Attaching package: ‘lubridate’
The following objects are masked from ‘package:base’:
date, intersect, setdiff, union
library('ggthemes')
library('kableExtra')
Attaching package: ‘kableExtra’
The following object is masked from ‘package:dplyr’:
group_rows
library('pastecs')
Attaching package: ‘pastecs’
The following objects are masked from ‘package:dplyr’:
first, last
The following object is masked from ‘package:tidyr’:
extract
library('viridis')
Loading required package: viridisLite
library('plotly')
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
Attaching package: ‘plotly’
The following object is masked from ‘package:ggplot2’:
last_plot
The following object is masked from ‘package:stats’:
filter
The following object is masked from ‘package:graphics’:
layout
library('tidyquant')
Loading required package: PerformanceAnalytics
Loading required package: xts
Loading required package: zoo
Attaching package: ‘zoo’
The following objects are masked from ‘package:base’:
as.Date, as.Date.numeric
Attaching package: ‘xts’
The following objects are masked from ‘package:pastecs’:
first, last
The following objects are masked from ‘package:dplyr’:
first, last
Attaching package: ‘PerformanceAnalytics’
The following object is masked from ‘package:graphics’:
legend
Loading required package: quantmod
Loading required package: TTR
Registered S3 method overwritten by 'quantmod':
method from
as.zoo.data.frame zoo
══ Need to Learn tidyquant? ══════════════════════════════════════════
Business Science offers a 1-hour course - Learning Lab #9: Performance Analysis & Portfolio Optimization with tidyquant!
</> Learn more at: https://university.business-science.io/p/learning-labs-pro </>
library('scales')
Attaching package: ‘scales’
The following object is masked from ‘package:viridis’:
viridis_pal
The following object is masked from ‘package:purrr’:
discard
The following object is masked from ‘package:readr’:
col_factor
# note, do not run install.packages() inside a code chunk. install them in the console outside of a code chunk.
1a) Loading data
#Reading the data in and doing minor initial cleaning in the function call
#Reproducible data analysis should avoid all automatic string to factor conversions.
#strip.white removes white space
#na.strings is a substitution so all that have "" will = na
data <- read.csv(here::here("final_project", "donor_data.csv"),
stringsAsFactors = FALSE,
strip.white = TRUE,
na.strings = "")
1b) Fixing the wonky DOB & Data cleanup
#(Birthdate and Age, ID as a number)adding DOB (Age/Spouse Age) in years columns and adding two fields for assignment and number of children
dataclean <- data %>%
mutate(Birthdate = ifelse(Birthdate == "0001-01-01", NA, Birthdate)) %>%
mutate(Birthdate = mdy(Birthdate)) %>%
mutate(Age = as.numeric(floor(interval(start= Birthdate, end=Sys.Date())/duration(n=1, unit="years")))) %>%
mutate(Spouse.Birthdate = ifelse(Spouse.Birthdate == "0001-01-01", NA, Spouse.Birthdate)) %>%
mutate(Spouse.Birthdate = mdy(Spouse.Birthdate)) %>%
mutate(Spouse.Age = as.numeric(floor(interval(start= Spouse.Birthdate,
end=Sys.Date())/duration(n=1, unit="years")))) %>%
mutate(ID = as.numeric(ID)) %>%
mutate(Assignment_flag = ifelse(is.na(Assignment.Number), 0,1)) %>%
mutate( No_of_Children = ifelse(is.na(Child.1.ID),0,
ifelse(is.na(Child.2.ID),1,2)))
#conferral dates
dataclean <- dataclean %>%
mutate(Conferral.Date.1 = ifelse(Conferral.Date.1 == "0001-01-01", NA, Conferral.Date.1)) %>%
mutate(Conferral.Date.1 = mdy(Conferral.Date.1)) %>%
mutate(Conferral.Date.1.Age = as.numeric(floor(interval(start= Conferral.Date.1, end=Sys.Date())/duration(n=1, unit="years")))) %>%
mutate(Conferral.Date.2 = ifelse(Conferral.Date.2 == "0001-01-01", NA, Conferral.Date.2)) %>%
mutate(Conferral.Date.2 = mdy(Conferral.Date.2)) %>%
mutate(Conferral.Date.2.Age = as.numeric(floor(interval(start= Conferral.Date.2, end=Sys.Date())/duration(n=1, unit="years")))) %>%
mutate(Last.Contact.By.Anyone = ifelse(Last.Contact.By.Anyone == "0001-01-01", NA, Last.Contact.By.Anyone)) %>%
mutate(Last.Contact.By.Anyone = mdy(Last.Contact.By.Anyone)) %>%
mutate(Last.Contact.Age = as.numeric(floor(interval(start= Last.Contact.By.Anyone, end=Sys.Date())/duration(n=1, unit="years")))) %>%
mutate(HH.First.Gift.Date = ifelse(HH.First.Gift.Date == "0001-01-01", NA, HH.First.Gift.Date)) %>%
mutate(HH.First.Gift.Date = mdy(HH.First.Gift.Date)) %>%
mutate(HH.First.Gift.Age = as.numeric(floor(interval(start= HH.First.Gift.Date, end=Sys.Date())/duration(n=1, unit="years"))))
#major gift
dataclean <-
dataclean %>%
mutate(major_gifter = ifelse(Lifetime.Giving > 50000, 1,0) %>% factor(., levels = c("0","1")))
#splitting up the age into ranges and creating category for easy visualization
dataclean <- dataclean %>%
mutate(age_range =
ifelse(Age %in% 10:19, "10 < 20 years old",
ifelse(Age %in% 20:29, "20 < 30 years old",
ifelse(Age %in% 30:39, "30 < 40 years old",
ifelse(Age %in% 40:49, "40 < 50 years old",
ifelse(Age %in% 50:59, "50 < 60 years old",
ifelse(Age %in% 60:69, "60 < 70 years old",
ifelse(Age %in% 70:79, "70 < 80 years old",
ifelse(Age %in% 80:89, "80 < 90 years old",
ifelse(Age %in% 90:120, "90+ years old",
NA))))))))))
#seeing what we have
table(dataclean$age_range)
10 < 20 years old 20 < 30 years old 30 < 40 years old
3990 24560 21034
40 < 50 years old 50 < 60 years old 60 < 70 years old
16857 20748 18258
70 < 80 years old 80 < 90 years old 90+ years old
12242 5986 6632
#50-60 is the most common age range
#creating a region column using the county data and the OMB MSA (Metropolitan Statistical Area) definitions
dataclean <- dataclean %>%
mutate(region =
ifelse(County == "San Luis Obispo" & State == "CA", "So Cal",
ifelse(County == "Kern" & State == "CA", "So Cal",
ifelse(County == "San Bernardino" & State == "CA", "So Cal",
ifelse(County == "Santa Barbara" & State == "CA", "So Cal",
ifelse(County == "Ventura" & State == "CA", "So Cal",
ifelse(County == "Los Angeles" & State == "CA", "So Cal",
ifelse(County == "Orange" & State == "CA", "So Cal",
ifelse(County == "Riverside" & State == "CA", "So Cal",
ifelse(County == "San Diego" & State == "CA", "So Cal",
ifelse(County == "Imperial" & State == "CA", "So Cal",
ifelse(County == "King" & State == "WA", "Seattle",
ifelse(County == "Snohomish" & State == "WA", "Seattle",
ifelse(County == "Pierce" & State == "WA", "Seattle",
ifelse(County == "Clackamas" & State == "OR", "Portland",
ifelse(County == "Columbia" & State == "OR", "Portland",
ifelse(County == "Multnomah" & State == "OR", "Portland",
ifelse(County == "Washington" & State == "OR", "Portland",
ifelse(County == "Yamhill" & State == "OR", "Portland",
ifelse(County == "Clark" & State == "WA", "Portland",
ifelse(County == "Skamania" & State == "WA", "Portland",
ifelse(County == "Denver" & State == "CO", "Denver",
ifelse(County == "Arapahoe" & State == "CO", "Denver",
ifelse(County == "Jefferson" & State == "CO", "Denver",
ifelse(County == "Adams" & State == "CO", "Denver",
ifelse(County == "Douglas" & State == "CO", "Denver",
ifelse(County == "Broomfield" & State == "CO", "Denver",
ifelse(County == "Elbert" & State == "CO", "Denver",
ifelse(County == "Park" & State == "CO", "Denver",
ifelse(County == "Clear Creek" & State == "CO", "Denver",
ifelse(County == "Alameda" & State == "CA", "Bay Area",
ifelse(County == "Contra Costa" & State == "CA", "Bay Area",
ifelse(County == "Marin" & State == "CA", "Bay Area",
ifelse(County == "Monterey" & State == "CA", "Bay Area",
ifelse(County == "Napa" & State == "CA", "Bay Area",
ifelse(County == "San Benito" & State == "CA", "Bay Area",
ifelse(County == "San Francisco" & State == "CA", "Bay Area",
ifelse(County == "San Mateo" & State == "CA", "Bay Area",
ifelse(County == "Santa Clara" & State == "CA", "Bay Area",
ifelse(County == "Santa Cruz" & State == "CA", "Bay Area",
ifelse(County == "Solano" & State == "CA", "Bay Area",
ifelse(County == "Sonoma" & State == "CA", "Bay Area",
NA))))))))))))))))))))))))))))))))))))))))))
dataclean <- dataclean %>%
mutate(region =
ifelse(County == "Kings" & State == "NY", "New York",
ifelse(County == "Queens" & State == "NY", "New York",
ifelse(County == "New York" & State == "NY", "New York",
ifelse(County == "Bronx" & State == "NY", "New York",
ifelse(County == "Richmond" & State == "NY", "New York",
ifelse(County == "Westchester" & State == "NY", "New York",
ifelse(County == "Bergen" & State == "NY", "New York",
ifelse(County == "Hudson" & State == "NY", "New York",
ifelse(County == "Passaic" & State == "NY", "New York",
ifelse(County == "Putnam" & State == "NY", "New York",
ifelse(County == "Rockland" & State == "NY", "New York",
ifelse(County == "Suffolk" & State == "NY", "New York",
ifelse(County == "Nassau" & State == "NY", "New York",
ifelse(County == "Middlesex" & State == "NJ", "New York",
ifelse(County == "Monmouth" & State == "NJ", "New York",
ifelse(County == "Ocean" & State == "NJ", "New York",
ifelse(County == "Somerset" & State == "NJ", "New York",
ifelse(County == "Essex" & State == "NJ", "New York",
ifelse(County == "Union" & State == "NJ", "New York",
ifelse(County == "Morris" & State == "NJ", "New York",
ifelse(County == "Sussex" & State == "NJ", "New York",
ifelse(County == "Hunterdon" & State == "NJ", "New York",
ifelse(County == "Pike" & State == "NJ", "New York",
region))))))))))))))))))))))))
# code nor cal region as all others in CA not already defined
dataclean <- dataclean %>%
mutate(region =
ifelse(State == "CA" & is.na(region) == TRUE, "Nor Cal", region))
#Removing Columns that provide no benefit
dataclean <- subset(dataclean,select = -c(Assignment.Number
,Assignment.has.Historical.Mngr
,Suffix
,Assignment.Date
,Assignment.Manager
,Assignment.Role
,Assignment.Title
,Assignment.Status
,Strategy
,Progress.Level
,Assignment.Group
,Assignment.Category
,Funding.Method
,Expected.Book.Date
,Qualification.Amount
,Expected.Book.Amount
,Expected.Book.Date
,Hard.Gift.Total
,Soft.Credit.Total
,Total.Assignment.Gifts
,No.of.Pledges
,Proposal..
,Proposal.Notes
,HH.Life.Spouse.Credit
,Last.Contact.By.Manager
,X..of.Contacts.By.Manager
,DonorSearch.Range
,iWave.Range
,WealthEngine.Range
,Philanthropic.Commitments
))
#cleaning up zip codes removing -4 after
dataclean$Zip <- gsub(dataclean$Zip, pattern="-.*", replacement = "")
#adding zip code data and column
zip <- read.csv(here::here("final_project", "Salary_Zipcode.csv"),
stringsAsFactors = FALSE,
strip.white = TRUE,
na.strings = "")
#adding zip salary column
dataclean <-dataclean %>%
mutate(zipcode_slry = VLOOKUP(Zip, zip, NAME, S1902_C03_002E))
#slry range
dataclean <- dataclean %>%
mutate(zipslry_range =
ifelse(zipcode_slry %in% 10000:89999, "90K-99K",
ifelse(zipcode_slry %in% 90000:99999, "90K-99K",
ifelse(zipcode_slry %in% 100000:149999, "100K-149K",
ifelse(zipcode_slry %in% 150000:199999, "150K-199K",
ifelse(zipcode_slry %in% 200000:249999, "200K-249K",
ifelse(zipcode_slry %in% 250000:299999, "250K-299K",
ifelse(zipcode_slry %in% 300000:349999, "300K-349K",
ifelse(zipcode_slry %in% 350000:399999, "350K-399K",
ifelse(zipcode_slry %in% 400000:499999, "400K-499K",
ifelse(zipcode_slry %in% 500000:999999, "500K-999K",
NA)))))))))))
sum(is.na(dataclean$zipcode_slry))
[1] 62347
#adding scholarship data (y/n)
schlr <- read.csv(here::here("final_project", "scholarship.csv"),
stringsAsFactors = FALSE,
strip.white = TRUE,
na.strings = "")
#adding scholarship column
dataclean <-dataclean %>%
mutate(scholarship = VLOOKUP(ID, schlr, ID, SCHOLARSHIP))
#replacing NA with 0
dataclean$scholarship <- replace_na(dataclean$scholarship,'0')
#replacing Y with 1
dataclean$scholarship<-ifelse(dataclean$scholarship=="Y",1,0)
#checking how many are N
table(dataclean$scholarship)
0 1
295264 27962
#checking and deleting scholarship column
class(dataclean$schlr_fct)
[1] "NULL"
dataclean = subset(dataclean, select = -c(scholarship))
#checking for duplicates N >1 indicates a records values are in the file twice
dataclean %>% group_by(ID) %>% count() %>% arrange(desc(n))
#removing duplicated records
dataclean <- unique(dataclean)
#n = 1 no ID with multiple records cleaned of dupes
dataclean %>% group_by(ID) %>% count() %>% arrange(desc(n))
NA
1d Creating many many factor variables
dataclean <-
dataclean %>%
#SEX
mutate(sex_fct =
fct_explicit_na(Sex),
sex_simple =
fct_lump_n(Sex, n = 4),
#MARRIED
married_fct =
fct_explicit_na(Married),
#DONOR SEGMENT
donorseg_fct =
fct_explicit_na(Donor.Segment),
donorseg_simple =
fct_lump_n(Donor.Segment, n = 4),
#CONTACT RULE
contact_fct =
fct_explicit_na(Contact.Rules),
contact_simple =
fct_lump_n(Contact.Rules, n = 4),
#SPOUSE MAIL
spomail_fct =
fct_explicit_na(Spouse.Mail.Rules),
spomail_simple =
fct_lump_n(Spouse.Mail.Rules, n = 4),
#JOB TITLE
jobtitle_fct =
fct_explicit_na(Job.Title),
jobtitle_simple =
fct_lump_n(Job.Title, n = 5),
#DEGREE TYPE 1
deg1_fct =
fct_explicit_na(Degree.Type.1),
deg1_simple =
fct_lump_n(Degree.Type.1, n = 5),
#DEGREE TYPE 2
deg2_fct =
fct_explicit_na(Degree.Type.2),
deg2_simple =
fct_lump_n(Degree.Type.2, n = 5),
#MAJOR 1
maj1_fct =
fct_explicit_na(Major.1),
maj1_simple =
fct_lump_n(Major.1, n = 5),
#MAJOR 2
maj2_fct =
fct_explicit_na(Major.2),
maj2_simple =
fct_lump_n(Major.2, n = 5),
#MINOR 1
min1_fct =
fct_explicit_na(Minor.1),
min1_simple =
fct_lump_n(Minor.1, n = 5),
#MINOR 2
min2_fct =
fct_explicit_na(Minor.2),
min2_simple =
fct_lump_n(Minor.2, n = 5),
#SCHOOL 1
school1_fct =
fct_explicit_na(School.1),
school1_simple =
fct_lump_n(School.1, n = 5),
#SCHOOL 2
school2_fct =
fct_explicit_na(School.2),
school2_simple =
fct_lump_n(School.2, n = 5),
#INSTITUTION TYPE
insttype_fct =
fct_explicit_na(Institution.Type),
insttype_simple =
fct_lump_n(Institution.Type, n = 5),
#EXTRACURRICULAR
extra_fct =
fct_explicit_na(Extracurricular),
extra_simple =
fct_lump_n(Extracurricular, n = 5),
#HH FIRST GIFT FUND
hhfirstgift_fct =
fct_explicit_na(HH.First.Gift.Fund),
hhfirstgift_simple =
fct_lump_n(HH.First.Gift.Fund, n = 5),
#CHILD 1 ENROLL STATUS
ch1_enroll_fct =
fct_explicit_na(Child.1.Enroll.Status),
ch1_enroll_simple =
fct_lump_n(Child.1.Enroll.Status, n = 4),
#CHILD 1 MAJOR
ch1_maj_fct =
fct_explicit_na(Child.1.Major),
ch1_maj_simple =
fct_lump_n(Child.1.Major, n = 4),
#CHILD 1 MINOR
ch1_min_fct =
fct_explicit_na(Child.1.Minor),
ch1_min_simple =
fct_lump_n(Child.1.Minor, n = 4),
#CHILD 1 SCHOOL
ch1_school_fct =
fct_explicit_na(Child.1.School),
ch1_school_simple =
fct_lump_n(Child.1.School, n = 4),
#CHILD 1 FEEDER
ch1_feeder_fct =
fct_explicit_na(Child.1.Feeder.School),
ch1_feeder_simple =
fct_lump_n(Child.1.Feeder.School, n = 4),
#CHILD 2 ENROLL STATUS
ch1_enroll_fct =
fct_explicit_na(Child.2.Enroll.Status),
ch2_enroll_simple =
fct_lump_n(Child.2.Enroll.Status, n = 4),
#CHILD 2 MAJOR
ch2_maj_fct =
fct_explicit_na(Child.2.Major),
ch2_maj_simple =
fct_lump_n(Child.2.Major, n = 4),
#CHILD 2 MINOR
ch2_min_fct =
fct_explicit_na(Child.2.Minor),
ch2_min_simple =
fct_lump_n(Child.2.Minor, n = 4),
#CHILD 2 SCHOOL
ch2_school_fct =
fct_explicit_na(Child.2.School),
ch2_school_simple =
fct_lump_n(Child.2.School, n = 4),
#CHILD 2 FEEDER
ch2_feeder_fct =
fct_explicit_na(Child.2.Feeder.School),
ch2_feeder_simple =
fct_lump_n(Child.2.Feeder.School, n = 4),
)
#checking to see if its a factor
#class(dataclean$sex_fct)
#class(dataclean$donorseg_fct)
#class(dataclean$contact_fct)
#class(dataclean$spomail_fct)
#checking levels
#levels(dataclean$sex_simple)
#levels(dataclean$donorseg_simple)
#levels(dataclean$contact_simple)
#levels(dataclean$spomail_simple)
#levels(dataclean$hhfirstgift_simple)
#creating a table against Sex column
#table(dataclean$sex_fct, dataclean$sex_simple)
DonorSegment Analysis
#grouping by donorsegment and analyzing
dataclean %>%
group_by(Donor.Segment) %>%
summarise(Count = length(Donor.Segment),
mean_total_giv = mean(HH.Lifetime.Giving)) %>%
arrange(-Count) %>%
filter(Count >= 100) %>%
#added scales package to have the values show in dollar
mutate(mean_total_giv = dollar(mean_total_giv)) %>%
kable(col.names = c("Donor Segment", "Count", "Mean HH Lifetime Giving"), align=rep('c', 3)) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = F)
| Donor Segment | Count | Mean HH Lifetime Giving |
|---|---|---|
| NA | 231974 | $0.00 |
| Lost Donor | 69718 | $4,954.47 |
| Lapsed Donor | 11193 | $10,069.79 |
| Current Donor | 5603 | $90,638.32 |
| Lapsing Donor | 3862 | $16,590.15 |
| At-Risk Donor | 650 | $77,143.93 |
NA
NA
First gift size
aq <- quantile(dataclean$HH.First.Gift.Amount, probs = c(.25,.50,.75,.9,.99), na.rm = TRUE)
aq <- as.data.frame(aq)
aq$aq <- dollar(aq$aq)
aq %>%
kable(col.names = "Quantile") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = F)
| Quantile | |
|---|---|
| 25% | $0.00 |
| 50% | $0.00 |
| 75% | $0.00 |
| 90% | $40.00 |
| 99% | $1,910.06 |
NA
NA
Consecutive giving
#consecutive years of giving
dataclean %>%
filter(Max.Consec.Fiscal.Years > 0) %>%
ggplot(aes(Max.Consec.Fiscal.Years)) + geom_histogram(fill = "#002845", bins = 20) +
theme_economist_white() +
ggtitle("Consecutive Years of Giving Distribution") +
xlab(NULL) + ylab(NULL) + scale_x_continuous(breaks = seq(0,120,2)) +
scale_y_continuous(breaks = seq(0,10000000,5000))
NA
NA
NA
Lifetime giving based on number of children
dataclean %>%
filter(HH.Lifetime.Giving <= 10000) %>%
filter(HH.Lifetime.Giving > 0) %>%
mutate(`No_of_Children` = as.factor(`No_of_Children`)) %>%
ggplot(aes(HH.Lifetime.Giving, fill = `No_of_Children`)) + geom_histogram(bins = 30) + theme_economist_white() +
xlab(NULL) + ylab(NULL) + scale_x_continuous(breaks = seq(0,100000,1000)) +
scale_y_continuous(breaks = seq(0,100000000,5000)) +
ggtitle("Giving distribution and number of children")+
scale_fill_manual(values=c("#002845", "#00cfcc", "#ff9973"))
NA
NA
NA
Mean, Median, and Count of Giving in Age Ranges
age_range_giving <- dataclean %>%
group_by(age_range) %>%
summarise(avg_giving = mean(HH.Lifetime.Giving, na.rm = TRUE),
med_giving = median(HH.Lifetime.Giving, na.rm = TRUE),
amount_of_people_in_age_range = n())
glimpse(age_range_giving)
Rows: 10
Columns: 4
$ age_range <chr> "10 < 20 years old", "20 < 30 …
$ avg_giving <dbl> 0.4449699, 28.2733731, 391.223…
$ med_giving <dbl> 0, 0, 0, 0, 0, 0, 0, 10, 15, 0
$ amount_of_people_in_age_range <int> 3990, 24553, 21021, 16837, 207…
2a) Plotting average giving by age range
age_range_giving <-
age_range_giving %>%
mutate(age_range = factor(age_range))
ggplot(age_range_giving, aes(age_range, avg_giving)) +
geom_bar(stat = "identity")+
theme(axis.text.x = element_text(angle=45,
hjust=1))
NA
NA
2b) Count of donors based on age range (another way to look at it)
ggplot(dataclean,
aes(age_range)) +
geom_bar() +
theme(axis.text.x = element_text(angle=45,
hjust=1)) +
labs(title = "Count of Age Ranges", x = "", y = "")
NA
NA
2c) Boxplot of the Age Ranges Against the Lifetime Giving Amounts with a log scale applied - the reason we applied log scale is to resolve issues with visualizations that skew towards large values in our dataset.
ggplot(dataclean, aes(age_range,HH.Lifetime.Giving,fill = age_range)) +
geom_boxplot(
outlier.colour = "red") +
scale_y_log10() +
theme(axis.text.x=element_text(angle=45,hjust=1))
NA
NA
2d) Splitting by age and gender
#creating boxplots
dataclean %>%
filter(Age < 100) %>% #removing the weird outliers that are over 100
filter(Sex %in% c("M", "F")) %>%
ggplot(aes(Sex, Age)) +
geom_boxplot() +
theme_economist() +
ggtitle("Ages of Donors Based on Gender") +
xlab(NULL) + ylab(NULL)
NA
NA
Giving by gender
#remove NAs U X
q <- ggplot(dataclean)
q + stat_summary_bin(
aes(y = HH.Lifetime.Giving, x = sex_simple),
fun.y = "mean", geom = "bar")
summary(dataclean$sex_simple)
F M U X NA's
120781 108190 3683 7 90339
Mean age by gender
#breakdown of sexs
tally(group_by(dataclean, Sex))
summarize(group_by(dataclean, Sex),
avg_giving = mean(HH.Lifetime.Giving, na.rm = TRUE),
avg_age = mean(Age, na.rm = TRUE),
med_age = median(Age, na.rm = TRUE))
#grouping by sex and age range for slides
tally(group_by(dataclean, Sex, age_range))
NA
NA
NA
2e) Distribution of people in the states that they live.
dataclean %>%
mutate(State = ifelse(State == " ", "NA", State)) %>%
filter(State != "NA") %>%
group_by(State) %>%
summarise(Count = length(State)) %>%
filter(Count > 800) %>%
arrange(-Count) %>%
kable(col.names = c("Donor's State", "Count")) %>%
kable_styling(bootstrap_options = c("condensed"),
full_width = F)
| Donor's State | Count |
|---|---|
| CA | 176487 |
| WA | 7957 |
| TX | 7266 |
| NY | 5659 |
| CO | 5073 |
| AZ | 4925 |
| OR | 4612 |
| FL | 4111 |
| IL | 3681 |
| HI | 3394 |
| PA | 2904 |
| OH | 2754 |
| NV | 2715 |
| MI | 2523 |
| MA | 2473 |
| NJ | 2311 |
| VA | 2158 |
| NC | 2085 |
| GA | 2044 |
| MO | 1889 |
| MN | 1732 |
| MD | 1488 |
| TN | 1443 |
| IN | 1417 |
| CT | 1380 |
| WI | 1330 |
| UT | 1173 |
| OK | 1151 |
| AL | 1120 |
| LA | 1110 |
| ID | 1096 |
| SC | 1076 |
| KY | 1032 |
| KS | 1027 |
| NM | 981 |
| IA | 880 |
NA
NA
NA
NA
NA
NA
2f) Looking at all donors first gift amount. 75% made a first gift of <100.
no_non_donors <- dataclean %>%
filter(Lifetime.Giving != 0)
nd <- quantile(no_non_donors$HH.First.Gift.Amount, probs = c(.25,.50,.75,.9,.99), na.rm = TRUE)
nd <- as.data.frame(nd)
nd %>%
kable(col.names = "Quantile") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = F)
| Quantile | |
|---|---|
| 25% | 3.8 |
| 50% | 25.0 |
| 75% | 100.0 |
| 90% | 500.0 |
| 99% | 15000.0 |
NA
NA
NA
NA
Split data
#converting married Y and N to 1 and 0
dataclean <- dataclean %>%
mutate(Married_simple = ifelse(Married == "N",0,1))
dataclean <- dataclean %>%
mutate(hh.lifetime.giving_fct = as.factor(HH.Lifetime.Giving))
library("rsample")
data_split <- initial_split(dataclean, prop = 0.75)
data_train <- training(data_split)
data_test <- testing(data_split)
p <- dataclean %>%
ggplot(aes(Age)) + geom_histogram(bins=30, fill = "blue") + theme_economist_white() +
ggtitle("Overall Donor Age Distribution") +
xlab(NULL) + ylab(NULL) + scale_x_continuous(breaks = seq(5,100,by = 20)) +
scale_y_continuous(breaks = seq(20,100,by = 20)) + xlim(c(20,100))
Scale for 'x' is already present. Adding another scale for 'x',
which will replace the existing scale.
ggplotly(p)
p
ggplot(data = dataclean, aes(x = Age)) + geom_histogram(fill ="blue")+ xlim(c(20,100))
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
NA
NA
NA
Another Histogram
dataclean %>%
filter(Age >= 10) %>%
filter(Age <= 90) %>%
ggplot(aes(Age)) + geom_histogram(fill = "#002845", bins = 20) + theme_economist_white() +
ggtitle("Overall Donor Age Distribution") +
xlab(NULL) + ylab(NULL) + scale_x_continuous(breaks = seq(0,120,5)) +
scale_y_continuous(breaks = seq(0,10000000,2000))
Age distribution by gender
#Age Gender filtered out below 15 and above 90 - also removed U X the weird values
dataclean %>%
filter(Age >= 15) %>%
filter(Age <= 90) %>%
mutate(Sex = as.factor(Sex)) %>%
filter(Sex != "U") %>%
filter(Sex != "X") %>%
ggplot(aes(Age, fill = Sex)) + geom_histogram(bins = 25) + theme_economist_white() +
ggtitle("Age Distribution by Gender") +
xlab(NULL) + ylab(NULL) + scale_x_continuous(breaks = seq(0,120,10)) +
scale_y_continuous(breaks = seq(0,50000,2000)) + scale_fill_manual(values=c("#ff9973", "#00cfcc"))
Donor age distribution by marital status
#Age Marital Status
dataclean %>%
filter(Age >= 20) %>%
filter(Age <= 85) %>%
ggplot(aes(Age, fill = Married)) + geom_histogram(bins = 25) + theme_economist_white() +
ggtitle("Overall Donor Age Distribution by Marital Status") +
xlab(NULL) + ylab(NULL) + scale_x_continuous(breaks = seq(0,120,5)) +
scale_y_continuous(breaks = seq(0,50000,2000)) + scale_fill_manual(values=c("#ff9973", "#00cfcc"))
Linear Model
ggplot(data = data_train, aes(x = Age, y = log(Lifetime.Giving))) + geom_point(alpha = 1/10) + geom_smooth(method = lm) + facet_wrap(~No_of_Children) + theme_clean(base_size = 8) + labs(x = "X", y = "Y") +
ggtitle("Title")
`geom_smooth()` using formula 'y ~ x'
MORE MODELS
Big logistic model
# Set family to binomial to set logistic function
# Run the model on the training set
donor_logit1 <-
glm(hh.lifetime.giving_fct ~ Married_simple,
family = "binomial",
data = data_train)
summary(donor_logit1)
Call:
glm(formula = hh.lifetime.giving_fct ~ Married_simple, family = "binomial",
data = data_train)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.1122 -0.6872 -0.6872 1.2440 1.7659
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.323203 0.005906 -224.1 <0.0000000000000002 ***
Married_simple 1.167887 0.009628 121.3 <0.0000000000000002 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 288365 on 242249 degrees of freedom
Residual deviance: 273662 on 242248 degrees of freedom
AIC: 273666
Number of Fisher Scoring iterations: 4
donor_logit2 <-
glm(hh.lifetime.giving_fct ~ No_of_Children,
family = "binomial",
data = data_train)
summary(donor_logit2)
Call:
glm(formula = hh.lifetime.giving_fct ~ No_of_Children, family = "binomial",
data = data_train)
Deviance Residuals:
Min 1Q Median 3Q Max
-0.9083 -0.8000 -0.8000 1.5411 1.6094
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.975131 0.005212 -187.10 <0.0000000000000002 ***
No_of_Children 0.151469 0.009049 16.74 <0.0000000000000002 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 288365 on 242249 degrees of freedom
Residual deviance: 288089 on 242248 degrees of freedom
AIC: 288093
Number of Fisher Scoring iterations: 4
#summary(data_train$major_gifter)
donor_logit3 <-
glm(major_gifter ~ Married_simple + No_of_Children + donorseg_simple + Assignment_flag + Total.Giving.Years,
family = "binomial",
data = data_train)
summary(donor_logit3)
Call:
glm(formula = major_gifter ~ Married_simple + No_of_Children +
donorseg_simple + Assignment_flag + Total.Giving.Years, family = "binomial",
data = data_train)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.4932 -0.1410 -0.1220 -0.0873 3.5040
Coefficients:
Estimate Std. Error z value
(Intercept) -4.08066 0.24408 -16.718
Married_simple -1.23973 0.08749 -14.170
No_of_Children 0.71530 0.05765 12.407
donorseg_simpleCurrent Donor -0.04313 0.24679 -0.175
donorseg_simpleLapsed Donor -0.60244 0.25434 -2.369
donorseg_simpleLapsing Donor -0.40470 0.26802 -1.510
donorseg_simpleLost Donor -0.96219 0.24415 -3.941
Assignment_flag 1.19911 0.11744 10.210
Total.Giving.Years 0.14559 0.00441 33.013
Pr(>|z|)
(Intercept) < 0.0000000000000002 ***
Married_simple < 0.0000000000000002 ***
No_of_Children < 0.0000000000000002 ***
donorseg_simpleCurrent Donor 0.8613
donorseg_simpleLapsed Donor 0.0179 *
donorseg_simpleLapsing Donor 0.1311
donorseg_simpleLost Donor 0.0000811 ***
Assignment_flag < 0.0000000000000002 ***
Total.Giving.Years < 0.0000000000000002 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 9764.4 on 68404 degrees of freedom
Residual deviance: 7544.5 on 68396 degrees of freedom
(173845 observations deleted due to missingness)
AIC: 7562.5
Number of Fisher Scoring iterations: 8
exp(donor_logit3$coefficients)
(Intercept) Married_simple
0.01689628 0.28946282
No_of_Children donorseg_simpleCurrent Donor
2.04479757 0.95778285
donorseg_simpleLapsed Donor donorseg_simpleLapsing Donor
0.54747539 0.66717987
donorseg_simpleLost Donor Assignment_flag
0.38205405 3.31714962
Total.Giving.Years
1.15672540
#training predictions for in sample preds
preds_train <- predict(donor_logit3, newdata = data_train, type = "response")
#test predicts for OOS (out of sample)
preds_test <- predict(donor_logit3, newdata = data_test, type = "response")
head(preds_train)
196454 181024 64789 23864 86886
NA 0.003806297 NA NA NA
188620
NA
head(preds_test)
1 6 17 18 21
0.047220650 0.764137284 0.931364886 0.001865082 0.040099014
27
0.188797205
results_train <- data.frame(
`truth` = data_train %>% select(major_gifter) %>%
mutate(major_gifter = as.numeric(major_gifter)),
`Class1` = preds_train,
`type` = rep("train",length(preds_train))
)
results_test <- data.frame(
`truth` = data_test %>% select(major_gifter) %>%
mutate(major_gifter = as.numeric(major_gifter)),
`Class1` = preds_test,
`type` = rep("test",length(preds_test))
)
results <- bind_rows(results_train,results_test)
dim(results_train)
[1] 242250 3
dim(results_test)
[1] 80750 3
dim(results)
[1] 323000 3
library('plotROC')
p_plot <-
ggplot(results,
aes(m = Class1, d = major_gifter, color = type)) +
geom_roc(labelsize = 2.5,
#Took the labelsize down to avoid cutoff
cutoffs.at = c(0.7,0.5,0.3,0.1,0)) +
#We removed some of the cutoffs to avoid the mashup near the origin.
#Changed the theme to avoid cutoff plot values.
theme_classic(base_size = 14) +
labs(x = "False Positive Rate",
y = "True Positive Rate") +
ggtitle("ROC Plot: Training and Test")
print(p_plot)
p_train <-
ggplot(results_train,
aes(m = Class1, d = major_gifter, color = type)) +
geom_roc(labelsize = 3.5,
cutoffs.at = c(0.7,0.5,0.3,0.1,0)) +
theme_minimal(base_size = 16) +
labs(x = "False Positive Rate",
y = "True Positive Rate") +
ggtitle("ROC Plot: Training and Test")
p_test <-
ggplot(results_test,
aes(m = Class1, d = major_gifter, color = type)) +
geom_roc(labelsize = 3.5,
cutoffs.at = c(0.7,0.5,0.3,0.1,0)) +
theme_minimal(base_size = 16) +
labs(x = "False Positive Rate",
y = "True Positive Rate") +
ggtitle("ROC Plot: Training and Test")
#Calculating AUC of both
print(calc_auc(p_train)$AUC)
[1] 0.8823867
print(calc_auc(p_test)$AUC)
[1] 0.8713205
RIDGE
library('glmnet')
library('glmnetUtils')
ridge_fit1 <- cv.glmnet(HH.Lifetime.Giving ~ sex_fct + donorseg_fct + No_of_Children,
data = data_train,
alpha = 0)
#Alpha 0 sets the Ridge
print(ridge_fit1)
print(ridge_fit1$lambda.min)
print(ridge_fit1$lambda.1se)
LASSO
coef(lasso_fit)
36 x 1 sparse Matrix of class "dgCMatrix"
s1
(Intercept) 5406.897
jobtitle_simpleAttorney .
jobtitle_simpleOwner .
jobtitle_simplePresident .
jobtitle_simpleTeacher .
jobtitle_simpleUnknown Position .
jobtitle_simpleOther .
deg1_simpleBachelor of Arts .
deg1_simpleBachelor of Fine Arts .
deg1_simpleBachelor of Science .
deg1_simpleMaster of Arts .
deg1_simpleNon Degree - Undergraduate .
deg1_simpleOther .
school1_simpleCollege of Health and Behavioral Sciences .
school1_simpleDonna Ford Attallah College of Educational Studies .
school1_simpleGeorge L. Argyros School of Business and Economics .
school1_simpleLawrence and Kristina Dodge Coll of Film & Media .
school1_simpleWilkinson Coll of Arts Humanities & Soc Sciences .
school1_simpleOther .
hhfirstgift_simpleChapman Annual Scholarship Fund .
hhfirstgift_simpleChapman Fund .
hhfirstgift_simpleJog-A-Thon .
hhfirstgift_simplePhonathon .
hhfirstgift_simplePre-SRN Conversion Gift History .
hhfirstgift_simpleOther .
maj1_simpleBusiness Administration BS .
maj1_simpleEducation .
maj1_simpleLaw (Full-Time) .
maj1_simpleUndecided - UG .
maj1_simpleUnknown Major .
maj1_simpleOther .
donorseg_simpleAt-Risk Donor .
donorseg_simpleCurrent Donor .
donorseg_simpleLapsed Donor .
donorseg_simpleLapsing Donor .
donorseg_simpleLost Donor .
#enet_mod <- cva.glmnet(dependent ~ indy1 + indy2,
# data = data,
# alpha = seq(0,1, by = 0.1))
#print(enet_mod)
#plot(enet_mod)
ELASTICNET
minlossplot(enet_mod,
cv.type = "min")
get_alpha <- function(fit) {
alpha <- fit$alpha
error <- sapply(fit$modlist,
function(mod) {min(mod$cvm)})
alpha[which.min(error)]
}
get_model_params <- function(fit) {
alpha <- fit$alpha
lambdaMin <- sapply(fit$modlist, `[[`, "lambda.min")
lambdaSE <- sapply(fit$modlist, `[[`, "lambda.1se")
error <- sapply(fit$modlist, function(mod) {min(mod$cvm)})
best <- which.min(error)
data.frame(alpha = alpha[best], lambdaMin = lambdaMin[best],
lambdaSE = lambdaSE[best], eror = error[best])
}
best_alpha <- get_alpha(enet_mod)
print(best_alpha)
get_model_params(enet_mod)
best_mod <- enet_mod$modlist[[which(enet_mod$alpha == best_alpha)]]
print(best_mod)
Ridges plot - could be useful for plotting donations vs donor segment
library('ggridges')
summary(data_train$variable)
ggplot(data_train, aes(x = HH.Lifetime.Giving, y = donorseg_fct)) + geom_density_ridges(rel_min_height = 0.005) + xlim(c(0, 400)) +
ggtitle("HH Lifetime Giving by Donor Segment")
library('corrplot')
#removing ID zip and nonnumeric
corrplot_data <- dataclean[-c(1:47,55:56,58:130)]
#Convert from character to numeric data type
convert_fac2num <- function(x){
as.numeric(as.factor(x))
}
corrplot_data <- mutate_at(corrplot_data,
.vars = c(1:8),
.funs = convert_fac2num)
#making a matrix
cd_cor <- cor(corrplot_data)
#creating correlation
col <- colorRampPalette(c("#BB4400", "#EE9990",
"#FFFFFF", "#77AAEE", "#4477BB"))
corrplot(cd_cor, method="color", col=col(100),
type="lower", addCoef.col = "black",
tl.pos="lt", tl.col="black",
tl.cex=0.7, tl.srt=45,
number.cex=0.7,
diag=FALSE)
Random Forest
library('randomForest')
rf_fit_donor <- randomForest(dependent ~ .,
data = data_train,
type = classification,
mtry = 7,
na.action = na.roughfix,
ntree = 200,
importance=TRUE
)
print(rf_fit_donor)
varImpPlot(rf_fit_donor, sort = TRUE,
n.var = 5,
type = 2, class = NULL, scale = TRUE,
main = deparse(substitute(rf_fit_donor)))
library('randomForestExplainer')
plot_min_depth_distribution(
rf_fit_donor,
k = 10,
min_no_of_trees = 0,
mean_sample = "top_trees",
mean_scale = FALSE,
mean_round = 2,
main = "Distribution of minimal depth and its mean"
)